Urban Parks & Demographics

Code
#imports
import seaborn as sns 
import pandas as pd
import numpy as np 
from matplotlib import pyplot as plt
import altair as alt
import geopandas as gpd
import json
import folium
import pygris
import cenpy
import re
import copy
import warnings
import holoviews as hv
import hvplot.pandas 
from holoviews import opts
hv.extension('bokeh')
import ipywidgets as widgets
from ipywidgets import interact
from scipy.stats import pearsonr
# Suppress all warnings
warnings.filterwarnings('ignore')
from folium import Map, Marker
from IPython.display import IFrame
Code
#Check Parks
#read shape file 
shape_file_loc =  'parks_map/geo_export_46a7de00-0067-42f5-a6bc-bec64e5a0f0b.shp'

#convert it into geopanda dataframe 
def get_gpd_df(use_shape_file=True):
    if use_shape_file:
        gdf = gpd.read_file(shape_file_loc)
    return gdf
parks_gpd =  get_gpd_df()

#plot the map 
park_map = parks_gpd.explore(
    tiles="Cartodb positron",
        style_kwds={
        "weight": 2,
        "color": "green",
        "fillOpacity": 0.5,
        }
)
folium.TileLayer(opacity=0.20).add_to(park_map)
#create api conncection
ny_state_code = "36"
variables = ['NAME', 'B16008_002E' #Native population
             , 'B16008_019E' #Foreign-born population
             , 'B25064_001E' , #median_rent
             'B19013_001E'] #median_income
acs = cenpy.remote.APIConnection("ACSDT5Y2021")
NY_demo_data2021 = acs.query(
    cols=variables,
    geo_unit="tract:*",
    geo_filter={"state": ny_state_code,  "county":  "*"},
)
acs2 = cenpy.remote.APIConnection("ACSDT5Y2010")
NY_demo_data2010 = acs2.query(
    cols=variables,
    geo_unit="tract:*",
    geo_filter={"state": ny_state_code, "county": "*"},
)

NY_demo_data2010.dropna(inplace=True)
NY_demo_data2021.dropna(inplace=True)

#"061" Manhattan (New York County) 005 #Bronx 081 Queen  047  Brooklyn (Kings County) 085 Staten Island (Richmond County)
ny_city_counties = ["061" , "005" , "081", "047", "085"]

NY_demo_data2010 = NY_demo_data2010[NY_demo_data2010['county'].isin(ny_city_counties)]
NY_demo_data2021 = NY_demo_data2021[NY_demo_data2021['county'].isin(ny_city_counties)]


for variable in variables:
    if variable != "NAME":
        NY_demo_data2010[variable] = NY_demo_data2010[variable].astype(float)

for variable in variables:
    if variable != "NAME":
        NY_demo_data2021[variable] = NY_demo_data2021[variable].astype(float)
        
NY_demo_data2010['population2010'] = NY_demo_data2010['B16008_002E'] + NY_demo_data2010['B16008_019E']
NY_demo_data2010 = NY_demo_data2010.rename(columns={'B19013_001E': 'median_income2010' ,'B25064_001E': 'median_rent2010'})
NY_demo_data2010 = NY_demo_data2010[NY_demo_data2010['population2010']>0]
NY_demo_data2010 = NY_demo_data2010[NY_demo_data2010['median_income2010']>=0]

NY_demo_data2021['population2021'] = NY_demo_data2021['B16008_002E'] + NY_demo_data2021['B16008_019E']
NY_demo_data2021 = NY_demo_data2021.rename(columns={'B19013_001E': 'median_income2021' ,'B25064_001E': 'median_rent2021'})
NY_demo_data2021 = NY_demo_data2021[NY_demo_data2021['population2021']>0]
NY_demo_data2021 = NY_demo_data2021[NY_demo_data2021['median_income2021']>=0]

population_total_df = pd.merge(NY_demo_data2010, NY_demo_data2021 , on=['tract','county','NAME']) 
population_total_df['change_precent'] = ((population_total_df['population2021'] - population_total_df['population2010']) /(np.absolute(population_total_df['population2010'])))*100

shape_file_loc =  'census_tract_shapefile/tl_2021_36_tract.shp'
#convert it into geopanda dataframe 
def get_gpd_df(use_shape_file=True):
    if use_shape_file:
        gdf = gpd.read_file(shape_file_loc)
    return gdf
nyc_gpd =  get_gpd_df()

nyc_demo_merged = nyc_gpd.merge(
    population_total_df,
    left_on=["STATEFP", "COUNTYFP", "TRACTCE"],
    right_on=["state_x", "county", "tract"],)

nyc_pct_change_mean = nyc_demo_merged['change_precent'].mean()

nyc_demo_merged = nyc_demo_merged[['tract', 'geometry', 'population2010','population2021','change_precent'
                                  ,'median_income2010' , 'median_income2021',
                                  'median_rent2010','median_rent2021']]
nyc_demo_merged = nyc_demo_merged[nyc_demo_merged['change_precent'] < 300]

#| echo: true 
#| code-fold: true
def style(feature):
    return {
        'fillColor': 'green',  
        'color': 'black',      
        'weight': 2,          
        'fillOpacity': 0.6    
    }
def mapping(df,col,parks_gpd=parks_gpd):

    m = df.explore(column=col,cmap = 'Blues', 
                                tiles="CartoDB positron", zoom_start=11)
    folium.Marker(location=[40.747993, -74.004890], popup="The High Line" , icon=folium.Icon(icon='tree' ,color='red')).add_to(m)
    folium.Marker(location=[40.785091, -73.968285], popup="Central Park" , icon=folium.Icon(icon='tree' ,color='red')).add_to(m)
    folium.Marker(location=[40.665535, -73.969749], popup="Prospect Park" , icon=folium.Icon(icon='tree' ,color='red')).add_to(m)
    folium.Marker(location=[40.699215, -73.999039], popup="Brooklyn Bridge Park" , icon=folium.Icon(icon='tree' ,color='red')).add_to(m)
    folium.Marker(location=[40.739716, -73.840782], popup="Flushing Meadows Corona Park" , icon=folium.Icon(icon='tree' ,color='red')).add_to(m)
    folium.Marker(location=[40.703564, -74.016678], popup="Battery Park" , icon=folium.Icon(icon='tree' ,color='red')).add_to(m)

    folium.GeoJson(
        parks_gpd,
        name='geojson_layer',
        style_function=style
    ).add_to(m)
    
    return m 

Do urban parks influence migration to nearby neighborhoods?

Code
m = mapping(nyc_demo_merged,"change_precent",parks_gpd=parks_gpd)


title_html = '''
             <h3 align="center" style="font-size:16px"><b>Population Change from 2010 to 2021 in NYC</b></h3>
             '''
m.get_root().html.add_child(folium.Element(title_html))

# Save the map as HTML
html_path = "map1.html"
m.save(html_path)
# Display the HTML file as an iframe
IFrame(html_path, width=600, height=600)

After analyzing the census population growth from 2010 to 2021. It appears that there are noteworthy trends around the major urban parks in NYC.

-In Manhattan, Battery Park had the most significant population change compared to the parks such as Central Park.
-In Brooklyn, the park that stand out with the most significant population is Brooklyn Bridge Park with more than 150% increase in population growth.
-In Queens, the Flushing Meadows Corona Park area experienced the most significant growth among all NYC park areas. It’s worth noting that Flushing Park used to be the site of two twentieth-century World’s Fairs.
As for the High Line, an elevated linear park built on a historic freight rail line, it opened in 2009. There has been a moderate population growth in the neighborhoods around it.

Understanding these demographic shifts could aid urban planners and policymakers in their decision-making process

Did neighborhoods nearby urban parks experience a decrease in population between Census 2010 and Census 2020?

Code
nyc_demo_merged_sub_loss = nyc_demo_merged[nyc_demo_merged['change_precent'] < 0]
m2 = mapping(nyc_demo_merged_sub_loss,"change_precent",parks_gpd=parks_gpd)

title_html = '''
             <h2 align="center" style="font-size:16px"><b>Areas with population decrease from 2010 to 2021 in NYC</b></h3>
             '''
m2.get_root().html.add_child(folium.Element(title_html))
# Save the map as HTML
html_path = "map2.html"
m2.save(html_path)
# Display the HTML file as an iframe
IFrame(html_path, width=600, height=600)

The data reveals a 10% decrease in population in some census tracts around Central Park, Brooklyn Bridge Park and Prospect Park. This could be due to increased living costs and property values and development in urban projects in other areas

Does the variation in rental prices contribute to the population decline in areas experiencing a decrease?

Code
def compare(df, tracts , park_name):
    df.tract = df.tract.astype(str)
    df = df[df.tract.isin(tracts)]
    df = df[['median_income2010', 'median_income2021','median_rent2010' , 'median_rent2021']]
    mean = df.mean(axis=0).reset_index()
    mean = mean.rename(columns={
    'index': 'Variable',
    0: 'Values'})
    rent = mean[mean['Variable'].str.contains('rent')]
    income = mean[mean['Variable'].str.contains('income')]

    rent['Year'] = rent['Variable'].astype(str).str[-4:]
    income['Year'] = income['Variable'].astype(str).str[-4:]
    
    fig, axes = plt.subplots(1, 2, figsize=(6, 4), gridspec_kw={'width_ratios': [1, 1]})
    sns.barplot(x='Year', y='Values', data=rent, palette='pastel', ax=axes[0])
    axes[0].set_xlabel('year')
    axes[0].set_ylabel('Median Rent')
    axes[0].set_title('Median Rent In ' + park_name + ' Area' , fontsize=10)
    sns.barplot(x='Year', y='Values', data=income, palette='pastel', ax=axes[1])
    axes[1].set_xlabel('year')
    axes[1].set_ylabel('Median Income')
    axes[1].set_title('Median Income  In ' + park_name + ' Area' , fontsize=10)
    plt.tight_layout()
    # Show the plots
    plt.show()
Code
central = ['013000' , '012000' , '015001' , '015002']
compare(nyc_demo_merged_sub_loss, central , 'Central Park')

Code
brooklyn = [ '000301' ]
compare(nyc_demo_merged_sub_loss, brooklyn , 'Brooklyn Bridge Park')

Code
wsqaure = [ '006500' ]
compare(nyc_demo_merged_sub_loss, wsqaure , 'Washington Park')

Code
wsqaure = [ '003900' ]
compare(nyc_demo_merged_sub_loss, wsqaure , 'Silver Lake Park')

Overall, NYC median rent and median income has increased.
As for the parks areas, there highest increase in the median rent for properties is in Silver Lake Park and Brooklyn Bridge Park.

Is there a correlation between median income and the accessibility of parks?

Code
with open('indicators_data/2388.json', 'r') as file:
    park_walking_data = json.load(file)
    
park_walking_data = pd.DataFrame(park_walking_data)
park_walking_data = park_walking_data[park_walking_data.GeoType=="UHF42"]

shape_file_loc = 'UHF 42/UHF_42_DOHMH.shp'
#convert it into geopanda dataframe 
def get_gpd_df(use_shape_file=True):
    if use_shape_file:
        gdf = gpd.read_file(shape_file_loc)
    return gdf
uhf_gpd =  get_gpd_df()
uhf_gpd.to_crs(epsg = "4326", inplace = True)

merged_ny_walking = uhf_gpd.merge(
    park_walking_data,
    left_on=["UHF"],
    right_on=["GeoID"])

joined_data = gpd.sjoin(nyc_demo_merged, merged_ny_walking, how='left', op='within')
joined_data.dropna(inplace=True)

numerical_correlation, _ = pearsonr(joined_data['Value'], joined_data['median_income2010'])
print(f"Pearson's correlation coefficient for year 2010: {numerical_correlation}")

numerical_correlation, _ = pearsonr(joined_data['Value'], joined_data['median_income2021'])
print(f"Pearson's correlation coefficient: for year 2021 {numerical_correlation}")

m = mapping(joined_data,"Value",parks_gpd=parks_gpd)
# Save the map as HTML
title_html = '''
             <h2 align="center" style="font-size:16px"> <b> Park Walkability </b></h3>
             '''
m.get_root().html.add_child(folium.Element(title_html))
html_path = "map3.html"
m.save(html_path)


# Display the HTML file as an iframe
IFrame(html_path, width=600, height=600)
Pearson's correlation coefficient for year 2010: -0.263688800485939
Pearson's correlation coefficient: for year 2021 -0.12378378631103196

In order to understand whether parks are distributed equally, the correlation between park walkability and income was calculated for UF42 areas. United Hospital Fund (UHF) Neighborhoods comprise 42 adjoining zip code area designated to approximate the Community Planning Districts in NYC.

The correlation value is -0.26 for 2010 and -0.12 for 2021 which indicates a weak negative correlation. As median income increases, the “walking distance to a park” percentage tends to decrease. While the correlation is weak, it may be caused by factors such as higher median income areas prioritizing other urban projects over parks. Moreover, individuals with higher median incomes are more likely to rely on private transportation, potentially Reducing the importance of proximity to parks in influencing their choice of residence.

Is there a correlation between the presence of families in nearby neighbourhoods and the availability of Children Playgrounds?

Code
shape_file_loc = 'CPA/geo_export_f2e1ecf0-ad20-4676-8b9d-ee96dfeef1bd.shp'

#convert it into geopanda dataframe 
def get_gpd_df(use_shape_file=True):
    if use_shape_file:
        gdf = gpd.read_file(shape_file_loc)
    return gdf
CPA_gpd =  get_gpd_df()


shape_file_loc = 'Athletic_Facilities/geo_export_35c572bd-a799-446c-890f-c78e4d293ea0.shp'

#convert it into geopanda dataframe 
def get_gpd_df(use_shape_file=True):
    if use_shape_file:
        gdf = gpd.read_file(shape_file_loc)
    return gdf
AF_gpd =  get_gpd_df()

#source https://docs.google.com/spreadsheets/d/1iIhwuLBlIus2n1EQ2a329jX4oJciXt9dEaxOFPpHfE8/edit?usp=sharing
sport_map = {
    'BKB': 'Basketball',
    'BSB': 'Baseball',
    'CRK': 'Cricket',
    'FFB': 'Flag Football',
    'FRS': 'Frisbee',
    'FTB': 'Football',
    'HDB': 'Handball',
    'HKY': 'Hockey',
    'KBL': 'Kickball',
    'LCS': 'Lacrosse',
    'MPPA': 'Multi Purpose Play Area',
    'NTB': 'Netball',
    'RBY': 'Rugby',
    'SCR': 'Soccer',
    'SFB': 'Softball',
    'TNS': 'Tennis',
    'TRK': 'Track',
    'VLB': 'Volleyball',
    'WFB': 'Wheelchair Football'
}

AF_gpd['primary_sp'] = AF_gpd['primary_sp'].replace(sport_map)
AF_count = AF_gpd.groupby('primary_sp').size().reset_index(name='Count')

plt.figure(figsize=(6, 4))
ax = sns.barplot(x='primary_sp', y='Count', data=AF_count, palette='Set2')
ax.set_title('Sports Facilities in NYC Parks' , fontsize=16)
ax.set_xlabel('Sport')
ax.set_ylabel('count')
plt.xticks(rotation=45, ha='right');

It can be seen from the plot above that the most frequent sports facilites in NYC parks are basketball , hockey followed by Tennis. As it’s known that the favorite sports in the US are Basketball and American Football.

Code
import folium
from folium.plugins import MarkerCluster

m = folium.Map(location=[40.747993, -74.004890] , tiles="Cartodb positron", zoom_start=10)
playgrounds_cluster = MarkerCluster(name='Children Play Area').add_to(m)
for idx, row in CPA_gpd.iterrows():
    folium.Marker(location=[row['geometry'].centroid.y, row['geometry'].centroid.x], popup=row['name']).add_to(playgrounds_cluster)

athletic_cluster = MarkerCluster(name='Athletic Facilities').add_to(m)
for idx, row in AF_gpd.iterrows():
    folium.Marker(location=[row['geometry'].centroid.y, row['geometry'].centroid.x] ,popup=row['primary_sp']).add_to(athletic_cluster)

folium.LayerControl().add_to(m)


title_html = '''
             <h2 align="center" style="font-size:16px"> <b>Distrubtion of Athletic Facilities and Playground Areas</b></h3>
             '''
m.get_root().html.add_child(folium.Element(title_html))


html_path = "FA_CPA_Map.html"
m.save(html_path)
# Display the HTML file as an iframe
IFrame(html_path, width=600, height=600)

It can be seen from the map above that athletic Facilities and playground Areas are mostly distrubuted in Manhattan and Queens compared to to other boroughs

Code
vars_fam = ['NAME', 'B11012_003E', 'B11012_006E', 'B11012_010E','B11012_015E']

acs = cenpy.remote.APIConnection("ACSDT5Y2021")
NY_fam_data2021 = acs.query(
    cols=vars_fam,
    geo_unit="tract:*",
    geo_filter={"state": ny_state_code,  "county":  "*"},
)
NY_fam_data2021 = NY_fam_data2021[NY_fam_data2021['county'].isin(ny_city_counties)]
for variable in vars_fam:
    if variable != "NAME":
        NY_fam_data2021[variable] = NY_fam_data2021[variable].astype(float)
        
NY_fam_data2021['Total_Families'] = NY_fam_data2021[['B11012_003E', 'B11012_006E', 'B11012_010E','B11012_015E']].sum(axis=1)

nyc_fam_merged = nyc_gpd.merge(
    NY_fam_data2021,
    left_on=["STATEFP", "COUNTYFP", "TRACTCE"],
    right_on=["state", "county", "tract"],)
Code
joined_data2 = gpd.sjoin(CPA_gpd, nyc_fam_merged, how="inner", op="within")
joined_data2.drop_duplicates(subset=['location','county'], inplace=True)

playgrounds_per_tract = joined_data2.groupby(['tract', 'county']).size().reset_index(name='CPA_Count')
CPA_fam_count = nyc_fam_merged.merge(playgrounds_per_tract, how='left', left_on=['tract','county'], right_on=['tract','county'])
CPA_fam_count.dropna(inplace=True)
CPA_fam_count = CPA_fam_count[CPA_fam_count.Total_Families>0]

correlation_coefficient, p_value = pearsonr(CPA_fam_count['CPA_Count'], CPA_fam_count['Total_Families'])

correlation_coefficient
0.07816702466699046
Code
%matplotlib inline
plt.figure(figsize=(7, 3))
sns.scatterplot(x='Total_Families', y='CPA_Count', data=CPA_fam_count)
plt.title('Number of Playgrounds vs Total Families')
plt.xlabel('families count')
plt.ylabel('Number of children Play Area')
plt.show()

Parks play a vital role in enhancing children’s well-being through spaces for physical activities like playgrounds and sports facilities. Investigating the correlation between the number of families and the availability of children’s playgrounds can aid in future planning for facilities and contribute to overall community well-being.

the correlation between the number of playgrounds and families in the NYC was calculated. It’s 0.078 suggests a very weak positive correlation.

Factors that could lead to this include:
1- other urban plannings projects priotrization
2-economic factors in the areas which contribute to whether an investment in playgrounds are possible
3- the presence of private areas that are considered as alternatives to public parks
4-Data limitation, since the result was acquired through spatial merge between families census tracts geospatial distrubtion and the map of playgrounds in NYC parks

Is there a corrleation between regular exercise and the avaliability atheltic facilities at parks ?

Code
with open('indicators_data/2060.json', 'r') as file:
    physical_activity_data = json.load(file)

physical_activity_data = pd.DataFrame(physical_activity_data)
physical_activity_data = physical_activity_data[physical_activity_data.GeoType=="UHF42"]
physical_activity_data.GeoID = physical_activity_data.GeoID.astype(float)

physical_activity_data = physical_activity_data[physical_activity_data.Time=='2019']
merged_ny_physical = uhf_gpd.merge(
    physical_activity_data,
    left_on=["UHF"],
    right_on=["GeoID"])

joined_gdf = gpd.sjoin(AF_gpd, merged_ny_physical, how='inner', op='intersects')

tract_aggregated = joined_gdf.groupby(['GeoID','borough' ])['system'].count().reset_index()
#system is the count of facilites in each area
merged_data = pd.merge(tract_aggregated, physical_activity_data, on=['GeoID'])
merged_data.drop_duplicates(inplace=True)
correlation_coefficient, p_value = pearsonr(merged_data['system'], merged_data['Value'])
correlation_coefficient
-0.2820924647154564
Code
plt.figure(figsize=(7, 3))
sns.scatterplot(x='Value', y='system', data=merged_data)
plt.title('Park Accessibility Vs Exercise rate')
plt.xlabel('% Of recent exercise per area responses')
plt.ylabel('Number of atheltic facilities')
plt.show()

Accessible athletic facilities could encourage physical activity which results in overall community well-being. The physical activity data was acquired from NYC Environment and Health Data Portal for the year 2017. They define Recent exercise as “Adults reporting exercise in the past 30 days (from when they were surveyed)”. In order to examine the relationship between the presence of public athletic facilities in NYC parks and physical activity levels, a correlation analysis was conducted.

Surprisingly, the result revealed a mild negative correlation (-28). It is crucial to note that the NYC Environment and Health Data Portal emphasizes that their data is based on estimates from small numbers and should be interpreted with caution.

Further studies could be conducted to determine the significance of this correlation, as it would provide valuable insights. Nowadays, there is a growing preference for personal trainers and private gyms over traditional athletic facilities. Additionally, it’s possible that the distribution of athletic facilities does not align with the exercise habits of the population in a given area.

Is there a corrleation between exercise and accessibility of parks?

Code
#Physical Activity 
with open('indicators_data/2060.json', 'r') as file:
    physical_activity_data = json.load(file)
    
with open('indicators_data/2388.json', 'r') as file:
    park_walking_data = json.load(file)
    
#read shape file for UHF 42
shape_file_loc = 'UHF 42/UHF_42_DOHMH.shp'

#convert it into geopanda dataframe 
def get_gpd_df(use_shape_file=True):
    if use_shape_file:
        gdf = gpd.read_file(shape_file_loc)
    return gdf
uhf_gpd =  get_gpd_df()
uhf_gpd.to_crs(epsg = "4326", inplace = True)


physical_activity_data = pd.DataFrame(physical_activity_data)
physical_activity_data = physical_activity_data[physical_activity_data.GeoType=="UHF42"]
physical_activity_data.GeoID = physical_activity_data.GeoID.astype(float)
merged_ny_physical = uhf_gpd.merge(
    physical_activity_data,
    left_on=["UHF"],
    right_on=["GeoID"])

plot = merged_ny_physical.hvplot(
    c="Value",
    groupby="Time",
    frame_width=600,
    frame_height=600,
    geo=True,
    dynamic=False,
    cmap="Greens",
    hover_cols=["GEOID"]
)
plot.opts(title='Recent Exercise in NY by Neighbourhood')

It can be seen from the map that the borough with the highest recent physical activity is Manhattan for the year 2020.

Code
#ref:
#https://a816-dohbesp.nyc.gov/IndicatorPublic/data-explorer/physical-activity/?id=2060

park_walking_data = pd.DataFrame(park_walking_data)
park_walking_data = park_walking_data[park_walking_data.GeoType=="UHF42"]

physical_activity_data = physical_activity_data[physical_activity_data.Time=='2017']

merged_ny_walking = uhf_gpd.merge(
    park_walking_data,
    left_on=["UHF"],
    right_on=["GeoID"])

plot = merged_ny_walking.hvplot(
    c="Value",
    groupby="Time",
    frame_width=600,
    frame_height=600,
    geo=True,
    dynamic=False,
    cmap="Greens",
    hover_cols=["GEOID"]
)
plot.opts(title='Walking Distance to a Park in NY by Neighbourhood in 2017')

Manhattan has the most park walkability among other boroughs.The could be due to parks density since Manhattan is the most densely populated borough in NYC

Code
merged_walking_corr = physical_activity_data.merge(
    park_walking_data, on=['GeoID', 'Time'])
correlation_coefficient, p_value = pearsonr(merged_walking_corr.Value_x,merged_walking_corr.Value_y)
#| echo: true 
#| code-fold: true
plt.figure(figsize=(7, 3))
sns.scatterplot(x='Value_x', y='Value_y', data=merged_walking_corr)
plt.title('Park Accessibility Vs Exercise rate')
plt.xlabel('% Of recent exercise per area responses')
plt.ylabel('% of Accessability to Parks per responses')
plt.show()
correlation_coefficient

0.28957159180561953

A correlation of 0.28 suggests a weak positive correlation As the walkability to the park in an area increases, the rate of exercise in that area tends to increase slightly.
When parks are more walkable, residents are more likely to utilize them for physical activities Also it promotes the neighborhood community to engage in social activites that would motiavte more residents.